import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Load the raw CSV into a DataFrame.
# NOTE(review): the path is machine-specific; adjust it before running elsewhere.
dataset = pd.read_csv("C:/Users/Renick Andrews/Downloads/bank-full.csv")

# Quick structural overview. These bare expressions only show output in an
# interactive session (notebook / REPL); in a plain script they are no-ops.
dataset.shape
dataset.columns
dataset.head(10)
dataset.dtypes
dataset.isnull().sum()
dataset.describe().transpose()

# Replace every -1 in pdays with 1.
# NOTE(review): -1 looks like a sentinel value; confirm 1 is the intended substitute.
dataset.pdays = dataset.pdays.replace(-1, 1)
dataset.head()

# One boxplot per numeric column. Each plot gets its own figure so successive
# calls do not pile up on the same Axes, and the column is passed through the
# explicit x= keyword because the meaning of the first positional argument of
# sns.boxplot changed between seaborn releases.
for numeric_column in ['age', 'balance', 'day', 'duration', 'pdays', 'campaign', 'previous']:
    plt.figure()
    sns.boxplot(x=dataset[numeric_column])

# Class balance of the target: bar chart, relative frequencies, per-class counts.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
plt.figure()
dataset["Target"].value_counts().plot(kind="bar")
dataset["Target"].value_counts(normalize=True)
dataset.groupby(dataset["Target"]).count()
from sklearn.preprocessing import LabelEncoder
# Integer-encode each listed column in place with one shared LabelEncoder.
# fit_transform refits the encoder on every call, so once the loop finishes
# `label` only holds the classes of the last column ('Target').
label = LabelEncoder()
for column_name in ('month', 'poutcome', 'job', 'marital', 'education',
                    'default', 'contact', 'housing', 'loan', 'Target'):
    dataset[column_name] = label.fit_transform(dataset[column_name])
# Pairwise scatter plots of all columns, with KDE curves on the diagonal.
sns.pairplot(data=dataset, diag_kind="kde")

# Annotated correlation heatmap on its own 15x15 figure.
plt.figure(figsize=(15, 15))
correlation_matrix = dataset.corr()
sns.heatmap(correlation_matrix, annot=True)

# `dataframe` is the name used by the rest of the script for this data.
dataframe = pd.DataFrame(dataset)
dataframe.head()
from sklearn.preprocessing import LabelEncoder
# Label-encode the listed columns of `dataframe` in place, reusing a single
# encoder that is refit for each column ('Target' is fitted last).
le = LabelEncoder()
for encoded_column in ('job', 'marital', 'education', 'default', 'contact',
                       'housing', 'loan', 'month', 'poutcome', 'Target'):
    dataframe[encoded_column] = le.fit_transform(dataframe[encoded_column])

dataframe.head()
dataframe.dtypes
# Cast the encoded columns to pandas' 'category' dtype, one column at a time.
for categorical_column in ('job', 'education', 'marital', 'default', 'contact',
                           'housing', 'loan', 'month', 'poutcome', 'Target'):
    dataframe[categorical_column] = dataframe[categorical_column].astype('category')

dataframe.dtypes
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
# Standardise every column of `dataframe` (the Target column included) and
# keep the original column labels on the result.
# NOTE(review): X_std is not referenced again in the visible part of this
# file; the models below are fitted on the unscaled values. Confirm intent.
scaled_values = StandardScaler().fit_transform(dataframe.values)
X_std = pd.DataFrame(scaled_values, columns=dataframe.columns)

# Split the frame column-wise: everything except 'Target' becomes the feature
# frame, and 'Target' alone becomes a single-column label frame.
is_target_column = dataframe.columns == 'Target'
X_dataframe = dataframe.loc[:, ~is_target_column]
y_dataframe = dataframe.loc[:, is_target_column]
from sklearn import model_selection
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Build NumPy inputs for the estimators from the first 16 feature columns.
features = X_dataframe.iloc[:, 0:16]
features_array = features.values

# Flatten the single-column label frame to a 1-D vector. Passing an (n, 1)
# column vector as y makes scikit-learn classifiers emit a
# DataConversionWarning on every fit.
target_labels = y_dataframe.values.ravel()

# 70/30 hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    features_array, target_labels, test_size=0.3, random_state=1)

# Report the share of rows in each partition and the training shapes.
total_rows = len(dataframe.index)
print("{0:0.2f}% data is in training set".format((len(X_train) / total_rows) * 100))
print("{0:0.2f}% data is in test set".format((len(X_test) / total_rows) * 100))
print(X_train.shape)
print(y_train.shape)
# Logistic regression with default settings: fit on the training split, then
# report test accuracy, the confusion matrix and the per-class report.
model = LogisticRegression()
model.fit(X_train, y_train)

y_predict = model.predict(X_test)
model_score = model.score(X_test, y_test)
print(model_score)

logreg_confusion = metrics.confusion_matrix(y_test, y_predict)
print(logreg_confusion)
logreg_report = metrics.classification_report(y_test, y_predict)
print(logreg_report)
# Gaussian naive Bayes: fit on the training split and evaluate on the test split.
model = GaussianNB()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Score THIS model. The original printed `model_score`, which still held the
# logistic-regression accuracy computed earlier in the script.
nb_score = model.score(X_test, y_test)
print(nb_score)
print(metrics.confusion_matrix(y_test, predictions))

# `expected` / `predicted` are kept as names; the predictions are reused
# instead of calling predict() on the same data a second time.
expected = y_test
predicted = predictions
print(metrics.classification_report(expected, predicted))
from sklearn.neighbors import KNeighborsClassifier
# k-nearest neighbours with k=3 and weights='distance'.
NNH = KNeighborsClassifier(n_neighbors=3, weights='distance')
NNH.fit(X_train, y_train)
predicted_labels = NNH.predict(X_test)

# Score THIS model. The original printed `model_score`, which still held the
# logistic-regression accuracy computed earlier in the script.
knn_score = NNH.score(X_test, y_test)
print(knn_score)
print(metrics.confusion_matrix(y_test, predicted_labels))
print(metrics.classification_report(y_test, predicted_labels))
# Unpruned decision tree using the Gini criterion; print train then test accuracy.
dTree = DecisionTreeClassifier(criterion='gini', random_state=1)
dTree.fit(X_train, y_train)
print(dTree.score(X_train, y_train))
print(dTree.score(X_test, y_test))

# Same comparison with the entropy criterion. random_state=1 is set here too,
# matching every other estimator in this script, so the fitted tree (and the
# tree image exported from it later) is reproducible between runs.
model_entropy = DecisionTreeClassifier(criterion='entropy', random_state=1)
model_entropy.fit(X_train, y_train)
print(model_entropy.score(X_train, y_train))
print(model_entropy.score(X_test, y_test))
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from IPython.display import Image
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
from sklearn import tree
from os import system
import pydotplus
import graphviz
# `display` is injected automatically inside a notebook; importing it
# explicitly keeps this step working when run outside one.
from IPython.display import display

# Re-split directly from the DataFrames (same 70/30 ratio and seed), so the
# partitions keep their column labels.
X_train, X_test, y_train, y_test = train_test_split(X_dataframe, y_dataframe, test_size=.30, random_state=1)

train_char_label = ['No', 'Yes']

# Write the entropy tree as Graphviz DOT. The context manager guarantees the
# file is closed even if the export raises. export_graphviz returns None when
# out_file is given, so dot_data is None here; the name is kept only for
# compatibility with the original script.
with open('df_tree.dot', 'w') as dataframe_Tree_File:
    dot_data = tree.export_graphviz(model_entropy, out_file=dataframe_Tree_File,
                                    feature_names=list(X_train),
                                    class_names=list(train_char_label))

# Render the DOT file to PNG with the external Graphviz `dot` executable.
retCode = system("dot -Tpng df_tree.dot -o df_tree.png")
if retCode != 0:
    # Any non-zero status is treated as failure.
    print("system command returning error: "+str(retCode))
else:
    display(Image("df_tree.png"))
# `display` is injected automatically inside a notebook; importing it
# explicitly keeps this step working when run outside one.
from IPython.display import display

# Gini tree limited to depth 3; print train then test accuracy.
dTree = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=1)
dTree.fit(X_train, y_train)
print(dTree.score(X_train, y_train))
print(dTree.score(X_test, y_test))

train_char_label = ['No', 'Yes']

# Write the depth-limited tree as Graphviz DOT. The context manager guarantees
# the file is closed even if the export raises. export_graphviz returns None
# when out_file is given, so dot_data is None here; the name is kept only for
# compatibility with the original script.
with open('df_treeR.dot', 'w') as df_Tree_File:
    dot_data = tree.export_graphviz(dTree, out_file=df_Tree_File,
                                    feature_names=list(X_dataframe),
                                    class_names=list(train_char_label))

# Render the DOT file to PNG with the external Graphviz `dot` executable.
retCode = system("dot -Tpng df_treeR.dot -o df_treeR.png")
if retCode != 0:
    # Any non-zero status is treated as failure.
    print("system command returning error: "+str(retCode))
else:
    display(Image("df_treeR.png"))
# Feature importances of the current `dTree`, labelled with the feature names.
importance_table = pd.DataFrame(dTree.feature_importances_,
                                columns=["Imp"],
                                index=X_dataframe.columns)
print(importance_table)

# Test accuracy, followed by the confusion matrix drawn as an annotated heatmap.
print(dTree.score(X_test, y_test))
y_predict = dTree.predict(X_test)
cm = metrics.confusion_matrix(y_test, y_predict, labels=[0, 1])

class_names = ["No", "Yes"]
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')
from sklearn.ensemble import BaggingClassifier
# Bagging ensemble of 50 estimators built from the current `dTree`.
# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the
# first positional slot means the same thing in both, so this call works
# regardless of the installed version.
bgcl = BaggingClassifier(dTree, n_estimators=50, random_state=1)
bgcl = bgcl.fit(X_train, y_train)

# Test accuracy, followed by the confusion matrix drawn as an annotated heatmap.
y_predict = bgcl.predict(X_test)
print(bgcl.score(X_test, y_test))
cm = metrics.confusion_matrix(y_test, y_predict, labels=[0, 1])

class_names = ["No", "Yes"]
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with 50 estimators and a fixed seed; fit() returns the estimator
# itself, so construction and fitting are chained.
abcl = AdaBoostClassifier(n_estimators=50, random_state=1).fit(X_train, y_train)

# Test accuracy, followed by the confusion matrix drawn as an annotated heatmap.
y_predict = abcl.predict(X_test)
print(abcl.score(X_test, y_test))
cm = metrics.confusion_matrix(y_test, y_predict, labels=[0, 1])

class_names = ["No", "Yes"]
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with 50 estimators and a fixed seed; fit() returns the
# estimator itself, so construction and fitting are chained.
gbcl = GradientBoostingClassifier(n_estimators=50, random_state=1).fit(X_train, y_train)

# Test accuracy, followed by the confusion matrix drawn as an annotated heatmap.
y_predict = gbcl.predict(X_test)
print(gbcl.score(X_test, y_test))
cm = metrics.confusion_matrix(y_test, y_predict, labels=[0, 1])

class_names = ["No", "Yes"]
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')
from sklearn.ensemble import RandomForestClassifier
# Random forest of 50 trees, max_features=12, fixed seed; fit() returns the
# estimator itself, so construction and fitting are chained.
rfcl = RandomForestClassifier(n_estimators=50, random_state=1, max_features=12).fit(X_train, y_train)

# Test accuracy, followed by the confusion matrix drawn as an annotated heatmap.
y_predict = rfcl.predict(X_test)
print(rfcl.score(X_test, y_test))
cm = metrics.confusion_matrix(y_test, y_predict, labels=[0, 1])

class_names = ["No", "Yes"]
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
plt.figure(figsize=(7, 5))
sns.heatmap(df_cm, annot=True, fmt='g')